/******************************************************************************
* This program calcualtes the StockTwit Messages and Replies Counts, Disagreement
* StockTwits Centrality Measurse
******************************************************************************/

/******************************************************************************
                             Get Permno-rdq1 [s, t]
******************************************************************************/

  proc sql;
    create table permno_rdq1
    as select a.*, b.number
    from (select distinct permno, rdq1, fyearq, fqtr
    from rep.comp_final_check_rqd1_18) as a,
         rep.tdays_2018 as b
    where a.rdq1 = b.date and a.permno ne .
    order by permno, rdq1;
    quit;
  ** Check how many trading days are between the current rdq1 and the next rdq1;
    * In case there are missing reports for the previous or the next qtr, set
      these varaibels to be missing as well;
    * First, start with next rdq1;
    proc sort data = permno_rdq1;
      by permno descending fyearq descending fqtr descending rdq1;
      run;
    data permno_rdq1;
      set permno_rdq1;
      by  permno descending fyearq descending fqtr descending rdq1;
      evttime_nrdq1 = lag(number) - number;
      next_rdq1 = lag(rdq1);
      if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne -1 then do;
        evttime_nrdq1 = .;
        next_rdq1 = .;
      end;
      if first.permno then do;
        evttime_nrdq1 = . ;
        next_rdq1 = .;
      end;
      format next_rdq1 date7.;
      run;
    * Second, calcualte the lag rdq1;
    proc sort data = permno_rdq1;
      by permno  fyearq  fqtr  rdq1;
      run;
    data permno_rdq1;
      set permno_rdq1;
      by  permno fyearq fqtr rdq1;
      evttime_lrdq1 = lag(number) - number;
      lag_rdq1 = lag(rdq1);
      if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne 1 then do;
        evttime_lrdq1 = .;
        lag_rdq1 = .;
      end;
      if first.permno then do;
        evttime_lrdq1 = . ;
        lag_rdq1 = .;
      end;
      format lag_rdq1 date7.;
      run;



*** Then, for each permno-rdq1 pair, get day -41, -11, -10, -3, -2, -1, 0, 1,
2, 3, 5, 10, 20, 30, 40, 50, 60, and 61 (i.e., 95);


  proc sql;
    create table permno_rdq1
    as select distinct a.*, max(a.lag_rdq1, b.date) as day_41,
      max(a.number+evttime_lrdq1, b.number) as number_41
    from permno_rdq1 as a left join rep.tdays_2018 as b
    on a.number - b.number = 41;


    create table permno_rdq1
    as select distinct a.*, max(a.lag_rdq1, b.date) as day_11,
      max(a.number+evttime_lrdq1, b.number) as number_11
    from permno_rdq1 as a left join rep.tdays_2018 as b
    on a.number - b.number = 11;

    create table permno_rdq1
    as select distinct a.*, max(a.lag_rdq1, b.date) as day_1,
      max(a.number+evttime_lrdq1, b.number) as number_1
    from permno_rdq1 as a left join rep.tdays_2018 as b
    on a.number - b.number = 1;


    create table permno_rdq1
    as select distinct a.*, b.date as day95,
      b.number as number61
    from permno_rdq1 as a left join rep.tdays_2018 as b
    on a.number+a.evttime_nrdq1 -5 = b.number
    order by permno, rdq1;

    quit;


/******************************************************************************
                            Get Daily Reply and All Messages
******************************************************************************/
* Import the pre-processed messages; 
* These datasets has the following strcutures:
  - permno
  - rdq1
  - day_*
  - number_*
  - message_id
  - date (of the message_id)
  - indirect_message_id
  - indirect_date (of the indirect_message_id)
  - direct_message (indicator for parent messages);
* For each parent message (message_id), all its offspring messages
* (indirect_message_id) including itself are all included;

data sample;
  set elog.sample1-elog.sample11;
  run;

%macro get_daily_messages_count(datain = , dataout = , count_name = );

  proc sql;
    create table &dataout as
    select distinct
       permno,
       rdq1,
       fyearq,
       fqtr,
       indirect_date as date,
       count(distinct indirect_message_id) as &count_name
       from &datain
    group by permno, rdq1, fyearq, fqtr, indirect_date
    order by permno, rdq1, fyearq, fqtr, indirect_date;
    quit;

  proc sql;
    create table &dataout as select distinct a.*, b.*
    from permno_rdq1 as a, &dataout as b
    where a.permno = b.permno and a.rdq1 = b.rdq1 and
          a.fyearq = b.fyearq and a.fqtr = b.fqtr;
    quit;

  %mend;

%get_daily_messages_count(datain = sample(where = (sp=1 and direct_message =
  0)), dataout = indirect_mentions, count_name = imen_sp);


/******************************************************************************
                             Get daily New Messages
******************************************************************************/
* This is dirmenas;
* Need the cleaned stocktwits data symbol_msg1;

*** Asocial direct mentions;

  * It is nearly impossible to distinguish the first-time mention of a stock by a user versus the
    repeated discussions ;
  * But it is possible to classify direct symbol mentions into social versus asocial;

  * Asocial direct mentions (asdirmen) is identified if a direct mention
    (contains no @) and
    (either a parent message or the one without replies) and
    (was not @ed by a previous same-stock mention);

  * All messages that mentioned the symbol;

   proc sql;
      create table direct_mentions
      as select distinct a.*, b.message_id, b.message_time_nyc, b.date,
        b.parent_message_id, b.in_reply_to_message_id, b.parent, b.user_id,
        b.at_user_id, b.symbol, count(message_id) as check_n,
        log(1+user_followers) as user_weight
      from permno_rdq1 as a, symbol_mssg1 as b
      where a.permno =b.permno and a.day_41<=b.date<=a.day61
      group by a.permno, a.rdq1, a.fyearq, a.fqtr, b.date, b.message_id,
      b.at_user_id
      order  by a.permno, a.rdq1, a.fyearq, a.fqtr, b.date, b.message_id,
      b.at_user_id;
      quit; * 10,264,023 obs;


  * Was it in response to a previous same-symbol mention? ;

  proc sql;
      create table direct_mentions_1 as
      select distinct a.*, b.message_id as pre_at_message_id,
        b.symbol as pre_at_symbol
      from direct_mentions as a left join symbol_mssg1 as b
      on a.symbol = b.symbol and
         a.user_id = b.at_user_id and
         (datepart(b.message_time_nyc) between a.day_41 and a.day61) and
         a.message_time_nyc> b.message_time_nyc
      group by a.permno, a.rdq1, a.fyearq, a.fqtr, b.date, b.message_id,
      b.at_user_id
      order  by a.permno, a.rdq1, a.fyearq, a.fqtr, b.date, b.message_id,
      b.at_user_id;
      quit; * 10,264,023 obs;

  data direct_mentions_1;
    set direct_mentions_1;

    response = 0;
    if not missing(pre_at_message_id) then response = 1;
    drop reponse pre_message_id;
    run ;

  proc sort data = direct_mentions_1 nodupkey;
    by permno rdq1 fyearq fqtr date message_id at_user_id response;
    run;


      data direct_mentions_1;
        set direct_mentions_1;


        asocial = 0 ;
        if (missing(at_user_id)) and
           (parent = 1 or missing(parent)) and
           response ne 1
        then asocial = 1;


        run;


*** Count daily mentions by types;

  * Generate a time_window variable: 1 indicates [-41,-11] and 2
    indicates [0,61];


  proc sql;
    create table direct_messages_1 as
    select distinct
           permno,
           rdq1,
           fyearq,
           fqtr,
           date,
           count(distinct message_id) as dirmenas, 
           sum(exp(user_weight)) as dirmenas_rawweighted
    from direct_mentions_1
   where asocial =1 /* Toggle this to generate social versus nonsocial */
    group by permno, rdq1, fyearq, fqtr, date
    order by permno, rdq1, fyearq, fqtr, date;
    quit;


  proc sql;
    create table direct_messages_1 as select distinct a.*, b.*
    from permno_rdq1 as a, direct_messages_1 as b
    where a.permno = b.permno and a.rdq1 = b.rdq1 and
          a.fyearq = b.fyearq and a.fqtr = b.fqtr;
    quit;

/******************************************************************************
                        Get Daily Disagreement Measures 
******************************************************************************/

%macro get_daily_disagreement(datain = , dataout = , dis_name = );

  proc sql;
    create table &dataout as
    select distinct
       permno,
       rdq1,
       fyearq,
       fqtr,
       indirect_date as date,
       std(ppos) as &dis_name
       from &datain
    group by permno, rdq1, fyearq, fqtr, indirect_date
    order by permno, rdq1, fyearq, fqtr, indirect_date;
    quit;

  proc sql;
    create table &dataout as select distinct a.*, b.*
    from permno_rdq1 as a, &dataout as b
    where a.permno = b.permno and a.rdq1 = b.rdq1 and
          a.fyearq = b.fyearq and a.fqtr = b.fqtr;
    quit;

  %mend;

%get_daily_messages_count(datain = sample(where = (sp=1)), dataout =dis_indirect_mentions, 
  dis_name = imen_ppos_std_sp); 
%get_daily_messages_count(datain = sample(where = (sp=1 and direct_message = 0)), 
  dataout = dis_allmessages, dis_name = amen_ppos_std_sp);

/******************************************************************************
                             Get Window Averages
******************************************************************************/


data various_mentions;
  merge indirect_mentions
        direct_messages_1
        dis_allmessages
        dis_indirect_mentions
        ;
  by permno rdq1 fyearq fqtr date;
  run;


proc sql;
  create table time_series as
    select distinct *
    from various_mentions (drop = imen: dirmen: date day_number);
    quit;

  proc sql;
    create table time_series as
    select distinct
           a.*,
           b.date,
           b.number as day_number
    from time_series as a left join
         rep.tdays_2018 as b
    on b.date between a.day_41 and a.day61;
    order by a.permno, a.rdq1, a.fyearq, a.fqtr, b.date;
    quit;

  * Merge and fill in zeros for missing dates;

  data various_mentions;
    merge time_series
          indirect_mentions
          direct_messages_1;
    by permno rdq1 fyearq fqtr date;
    array x(*) dirmenas imen_sp dirmenas_rawweighted;
    do i = 1 to dim(x);
      if missing(x(i)) then x(i) = 0;
      x(i) = log(1+x(i));
    end;
    arry y(*) imen_ppos_std_sp amen_ppos_std_sp;
    do i = 1 to dim(y);
      if missing(y(i)) then y(i) = .;
    end;
    if not missing(day_number); * Only pick trading days;
    drop i  _name_;
    run;



*** Get average values by windows ***;

  proc means data = various_mentions noprint;
    by permno rdq1 fyearq fqtr; 
    var dirmenas imen_sp; 
    output out = v1 mean= dirmenas_pre imen_sp_pre;
    where date between day_41 and day_11; 
    run;  

  proc means data = various_mentions noprint;
    by permno rdq1 fyearq fqtr; 
    var dirmenas imen_sp amen_ppos_std_sp imen_ppos_std_sp dirmenas_rawweighted; 
    output out = v2 mean= dirmenas_01 imen_sp_01 amen_ppos_std_sp_01 imen_ppos_std_sp_01;
    where date between day0 and day1; 
    run;  

  proc means data = various_mentions noprint;
    by permno rdq1 fyearq fqtr; 
    var dirmenas imen_sp amen_ppos_std_sp imen_ppos_std_sp; 
    output out = v3 mean= dirmenas_295 imen_sp_295 amen_ppos_std_sp_295 imen_ppos_std_sp_295;
    where date between day2 and day95; 
    run;  

/******************************************************************************
                            Persistence of Disagreement 
******************************************************************************/
%macro persis_absvi(input = , svinames = , out = );
    data time_series4;
      set &input;
      keep rdq1 permno fyearq fqtr &svinames date;
      if day0<=date<=day95;  
      run; 

    option nonotes; 
    proc iml;
    use time_series4;
    read all var{permno rdq1 fyearq fqtr} into firm;
    %do i=1 %to %sysfunc(countw(&svinames));
      %let name&i = %scan(&svinames, &i, %str( ));
      read all var{&&name&i} into v&i;
    %end;/*read all var{vomh_ab} into vomh;*/
    close;
    %let svinames_lgth = %sysfunc(countw(&svinames));
    %let n1 = %eval(&svinames_lgth+1);
    %let n2 = %eval(&svinames_lgth+4);
    b = uniqueby(firm, 1:4);  
    lala = j(nrow(b),&n2, .);      /* 3. Allocate vector to hold results*/
    lala[,&n1:&n2] = firm[b,1:4];
    b = b // (nrow(firm)+1);     /* trick: append (n+1) to end of b */
    do i = 1 to nrow(b)-1;    /* 4. For each level... */
    /*do i = 1 to 1000;*/
       idx = b[i]:(b[i+1]-1); /* 5. Find observations in level */
       if (b[i+1]-b[i])>=30 then do;
        %do j=1 %to %sysfunc(countw(&svinames));
           x = v&j[idx]; 
           if COUNTUNIQUE(x) >= 5 then do;  
           x = x[loc(x^=.)];  
           v&j._dm = x-mean(x); 
             call farmafit(d1,phi,ma,sigma,v&j._dm); 
             lala[i,&j] = d1; 
           end; 
        %end;
       end;
    end;


    create mydat4 from lala;
    append from lala;
    close mydat4;

    quit;
    option notes;
    data &out; set mydat4; 
      %let n3 = %eval(&n1+1); 
      %let n4 = %eval(&n1+2); 
      rename col&n1 = permno
             col&n3 = rdq1
             col&n4 = fyearq
             col&n2 = fqtr;
      format col&n2 date9.;
       %do i =1 %to %sysfunc(countw(&svinames));  
          %let name&i = %scan(&svinames, &i, %str( ));
          rename col&i = d_&&name&i;
       %end;
      run;
    proc sort data = &out; by permno rdq1 fyearq fqtr; run ;
    %mend;

%let list = amen_ppos_std_sp imen_ppos_std_sp;
%persis_absvi(input = various_mentions, svinames = &list, out = d_mentions_all );

/******************************************************************************
                        StockTwits Centraility Measures
******************************************************************************/
* Need the hand-collected messages dataset symbol_mssg1 linked to permno;
* Include only trading day counts;
proc sql;
  create table symbol_mssg1 as
  select distinct
    a.*,
    b.number as tday_number
  from symbol_mssg1 as a left join
       rep.tdays_2018 as b
  on a.date = b.date
  order by a.permno, a.date;
  quit;

data symbol_mssg1; set symbol_mssg1; count_tick = 1; run;

* Past 3month messages (61 trading days leading up to day -11);
proc sql;

  create table scen as
  select distinct
    a.*,
    sum(count_tick) as total_msg_past_3m
  from permno_rdq1 as a left join
       symbol_mssg1 as b
  on a.permno = b.permno and a.number_11 - 61 <=b.tday_number<=a.number_11 and
     not missing(b.tday_number)
  group by a.permno, a.rdq1, a.fyearq, a.fqtr;

  quit;


/******************************************************************************
                                Combine and Save
******************************************************************************/

data rep.stocktwits;
  merge v1-v3 d_mentions_all scen;
  by permno rdq1 fyearq fqtr;
  ab_dirmenas_01 = dirmenas_01 - dirmenas_pre;
  ab_imen_sp_01 = imen_sp_01 - imen_sp_pre;
  ab_dirmenas_295 = dirmenas_295 - dirmenas_01;
  ab_imen_sp_295 = imen_sp_295 - imen_sp_pre;
  infl = dirmenas_rawweighted_01 - dirmenas_01;
  keep permno rdq1 fyearq fqtr ab_dirmenas_01 ab_imen_sp_01 ab_dirmenas_295
        ab_imen_sp_295 infl total_msg_past_3m;
  run;
